In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

print("โœ… All libraries loaded.")
print("NumPy:", np.__version__)
print("Pandas:", pd.__version__)
print("Seaborn:", sns.__version__)
print("SciPy:", scipy.__version__)
โœ… All libraries loaded.
NumPy: 1.26.4
Pandas: 2.3.0+4.g1dfc98e16a
Seaborn: 0.11.2
SciPy: 1.13.1
In [2]:
df = pd.read_csv('water_potability.csv')
df.head(50)
Out[2]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
5 5.584087 188.313324 28748.687739 7.544869 326.678363 280.467916 8.399735 54.917862 2.559708 0
6 10.223862 248.071735 28749.716544 7.513408 393.663396 283.651634 13.789695 84.603556 2.672989 0
7 8.635849 203.361523 13672.091764 4.563009 303.309771 474.607645 12.363817 62.798309 4.401425 0
8 NaN 118.988579 14285.583854 7.804174 268.646941 389.375566 12.706049 53.928846 3.595017 0
9 11.180284 227.231469 25484.508491 9.077200 404.041635 563.885481 17.927806 71.976601 4.370562 0
10 7.360640 165.520797 32452.614409 7.550701 326.624353 425.383419 15.586810 78.740016 3.662292 0
11 7.974522 218.693300 18767.656682 8.110385 NaN 364.098230 14.525746 76.485911 4.011718 0
12 7.119824 156.704993 18730.813653 3.606036 282.344050 347.715027 15.929536 79.500778 3.445756 0
13 NaN 150.174923 27331.361962 6.838223 299.415781 379.761835 19.370807 76.509996 4.413974 0
14 7.496232 205.344982 28388.004887 5.072558 NaN 444.645352 13.228311 70.300213 4.777382 0
15 6.347272 186.732881 41065.234765 9.629596 364.487687 516.743282 11.539781 75.071617 4.376348 0
16 7.051786 211.049406 30980.600787 10.094796 NaN 315.141267 20.397022 56.651604 4.268429 0
17 9.181560 273.813807 24041.326280 6.904990 398.350517 477.974642 13.387341 71.457362 4.503661 0
18 8.975464 279.357167 19460.398131 6.204321 NaN 431.443990 12.888759 63.821237 2.436086 0
19 7.371050 214.496610 25630.320037 4.432669 335.754439 469.914551 12.509164 62.797277 2.560299 0
20 NaN 227.435048 22305.567414 10.333918 NaN 554.820086 16.331693 45.382815 4.133423 0
21 6.660212 168.283747 30944.363591 5.858769 310.930858 523.671298 17.884235 77.042318 3.749701 0
22 NaN 215.977859 17107.224226 5.607060 326.943978 436.256194 14.189062 59.855476 5.459251 0
23 3.902476 196.903247 21167.500099 6.996312 NaN 444.478883 16.609033 90.181676 4.528523 0
24 5.400302 140.739062 17266.593422 10.056852 328.358241 472.874073 11.256381 56.931906 4.824786 0
25 6.514415 198.767351 21218.702871 8.670937 323.596349 413.290450 14.900000 79.847843 5.200885 0
26 3.445062 207.926260 33424.768678 8.782147 384.007006 441.785876 13.805902 30.284597 4.184397 0
27 NaN 145.768181 13224.935639 7.906445 304.001993 298.990666 12.729525 49.536849 4.004871 0
28 NaN 266.421018 26362.965012 7.700063 395.389490 364.480107 10.348951 53.008381 3.991564 0
29 NaN 148.153061 15193.413474 9.046833 307.011793 563.804743 16.568656 52.676185 6.038185 0
30 7.181449 209.625601 15196.229987 5.994679 338.336431 342.111286 7.922598 71.537953 5.088860 0
31 9.825490 190.756618 19677.892466 6.757541 NaN 452.836235 16.899038 47.081971 2.857472 0
32 10.433291 117.791230 22326.892046 8.161505 307.707509 412.986834 12.890709 65.733478 5.057311 0
33 7.414148 235.044534 32555.852537 6.845952 387.175316 411.983364 10.244815 44.489297 3.160624 0
34 NaN 232.280452 14787.206265 5.474915 NaN 383.981723 12.166937 86.080727 5.029167 0
35 5.115817 191.952743 19620.545329 6.060713 323.836384 441.748379 10.966486 49.238231 3.902089 0
36 3.641630 183.908722 24752.072460 5.538314 286.059556 456.860096 9.034067 73.594657 3.464353 0
37 5.618064 304.235912 17281.975168 6.101084 NaN 399.471566 12.265002 81.588992 2.896547 0
38 NaN 143.453731 19942.273218 5.890755 NaN 427.130671 22.469892 53.124094 2.907564 0
39 9.267188 198.614395 24683.723566 6.110612 328.077533 396.876949 16.471969 30.383315 4.324005 0
40 NaN 233.858996 11703.923907 4.599388 309.039320 349.399633 18.338893 42.677465 3.510004 0
41 5.331940 194.874065 16658.876503 7.993830 316.675162 335.120398 10.180514 59.572714 4.434820 0
42 7.145772 238.689929 28780.340432 6.814029 385.975650 332.032706 11.093163 66.138045 5.182591 0
43 9.920691 202.817483 9973.934059 6.882248 337.350529 333.192470 23.917601 71.833624 4.690707 0
44 4.758439 183.349454 21568.428779 4.731349 NaN 403.944168 18.668229 66.912400 4.542801 0
45 5.702926 216.850474 35606.440177 7.184351 NaN 504.638260 16.140790 77.536184 4.137739 0
46 6.953864 209.638293 10575.186281 4.462707 315.606594 391.184315 13.285334 87.390889 3.195710 0
47 10.682966 173.375498 15758.740621 5.570784 307.352586 323.807913 10.090870 78.472784 3.999775 0
48 NaN 129.890572 34415.853146 6.321929 304.535224 470.329169 18.599410 72.403634 4.405586 0
49 8.757257 200.191400 21536.224687 4.915101 317.882900 404.717799 13.768323 47.930872 3.626135 0
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
In [4]:
df[df['Potability'] == 1]
Out[4]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
250 9.445130 145.805402 13168.529156 9.444471 310.583374 592.659021 8.606397 77.577460 3.875165 1
251 9.024845 128.096691 19859.676476 8.016423 300.150377 451.143481 14.770863 73.778026 3.985251 1
252 NaN 169.974849 23403.637304 8.519730 NaN 475.573562 12.924107 50.861913 2.747313 1
253 6.800119 242.008082 39143.403329 9.501695 187.170714 376.456593 11.432466 73.777275 3.854940 1
254 7.174135 203.408935 20401.102461 7.681806 287.085679 315.549900 14.533510 74.405616 3.939896 1
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 NaN 392.449580 19.903225 NaN 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 NaN 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 NaN 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 NaN 327.459760 16.140368 78.698446 2.309149 1

1278 rows ร— 10 columns

In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
In [6]:
# check for duplicated rows 
df.duplicated().sum()
Out[6]:
0
In [7]:
df.describe()
Out[7]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 2785.000000 3276.000000 3276.000000 3276.000000 2495.000000 3276.000000 3276.000000 3114.000000 3276.000000 3276.000000
mean 7.080795 196.369496 22014.092526 7.122277 333.775777 426.205111 14.284970 66.396293 3.966786 0.390110
std 1.594320 32.879761 8768.570828 1.583085 41.416840 80.824064 3.308162 16.175008 0.780382 0.487849
min 0.000000 47.432000 320.942611 0.352000 129.000000 181.483754 2.200000 0.738000 1.450000 0.000000
25% 6.093092 176.850538 15666.690297 6.127421 307.699498 365.734414 12.065801 55.844536 3.439711 0.000000
50% 7.036752 196.967627 20927.833607 7.130299 333.073546 421.884968 14.218338 66.622485 3.955028 0.000000
75% 8.062066 216.667456 27332.762127 8.114887 359.950170 481.792304 16.557652 77.337473 4.500320 1.000000
max 14.000000 323.124000 61227.196008 13.127000 481.030642 753.342620 28.300000 124.000000 6.739000 1.000000
In [8]:
df.isna().sum()
Out[8]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [9]:
df.dtypes
Out[9]:
ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object
  • Exploration Summary
  • we have a dataframe consisting of 3276 rows and 10 columns.
  • database have some null values , we need to remove them.
  • ph value cant be 0 , we need to replace it with median.
  • we can remove Potability (0,1) , and add potability level and potability score.

Data Cleaning¶

In [10]:
df['ph'].fillna(df['ph'].median(), inplace=True)
df['Sulfate'].fillna(df['Sulfate'].median(), inplace=True)
df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean(), inplace=True)
C:\Users\asus\AppData\Local\Temp\ipykernel_16508\1952783639.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ph'].fillna(df['ph'].median(), inplace=True)
C:\Users\asus\AppData\Local\Temp\ipykernel_16508\1952783639.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sulfate'].fillna(df['Sulfate'].median(), inplace=True)
C:\Users\asus\AppData\Local\Temp\ipykernel_16508\1952783639.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean(), inplace=True)
In [11]:
df.isna().sum()
Out[11]:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64
In [12]:
df[df['Potability'] == 1]
Out[12]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
250 9.445130 145.805402 13168.529156 9.444471 310.583374 592.659021 8.606397 77.577460 3.875165 1
251 9.024845 128.096691 19859.676476 8.016423 300.150377 451.143481 14.770863 73.778026 3.985251 1
252 7.036752 169.974849 23403.637304 8.519730 333.073546 475.573562 12.924107 50.861913 2.747313 1
253 6.800119 242.008082 39143.403329 9.501695 187.170714 376.456593 11.432466 73.777275 3.854940 1
254 7.174135 203.408935 20401.102461 7.681806 287.085679 315.549900 14.533510 74.405616 3.939896 1
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 333.073546 392.449580 19.903225 66.396293 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 333.073546 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 333.073546 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 333.073546 327.459760 16.140368 78.698446 2.309149 1

1278 rows ร— 10 columns

In [13]:
# Example: pH should not be 0 in real water
df[df['ph'] == 0]
Out[13]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
3014 0.0 214.846144 49456.587108 7.897539 333.073546 583.448849 7.702328 77.712891 4.92884 0
In [14]:
df.loc[df['ph'] == 0, 'ph'] = np.nan
df['ph'].fillna(df['ph'].median(), inplace=True)
C:\Users\asus\AppData\Local\Temp\ipykernel_16508\1201209032.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ph'].fillna(df['ph'].median(), inplace=True)
In [15]:
df[df['ph'] == 0]
Out[15]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
In [16]:
df.describe()
Out[16]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000
mean 7.076341 196.369496 22014.092526 7.122277 333.608364 426.205111 14.284970 66.396293 3.966786 0.390110
std 1.464832 32.879761 8768.570828 1.583085 36.143851 80.824064 3.308162 15.769881 0.780382 0.487849
min 0.227499 47.432000 320.942611 0.352000 129.000000 181.483754 2.200000 0.738000 1.450000 0.000000
25% 6.279317 176.850538 15666.690297 6.127421 317.094638 365.734414 12.065801 56.647656 3.439711 0.000000
50% 7.036752 196.967627 20927.833607 7.130299 333.073546 421.884968 14.218338 66.396293 3.955028 0.000000
75% 7.870050 216.667456 27332.762127 8.114887 350.385756 481.792304 16.557652 76.666609 4.500320 1.000000
max 14.000000 323.124000 61227.196008 13.127000 481.030642 753.342620 28.300000 124.000000 6.739000 1.000000

Data Visualization¶

What proportion of the dataset represents potable (safe) water samples?¶

In [17]:
# Total number of rows
total_samples = len(df)

# Number of safe water samples (Potability = 1)
safe_samples = df[df['Potability'] == 1].shape[0]

# Calculate percentage
safe_percentage = (safe_samples / total_samples) * 100

print(f"Safe water samples: {safe_samples} out of {total_samples} ({safe_percentage:.2f}%)")
Safe water samples: 1278 out of 3276 (39.01%)

How many samples fall into each potability level category?¶

In [18]:
# Step 1: Remove old Potability column
df.drop(columns=['Potability'], inplace=True)

# Step 2: Scoring Function (based on WHO recommended ranges)
def calculate_potability_score(row):
    score = 0
    
    if 6.5 <= row['ph'] <= 8.5: score += 15
    if row['Solids'] < 500: score += 10
    if row['Chloramines'] < 4: score += 10
    if row['Sulfate'] < 250: score += 10
    if row['Trihalomethanes'] < 80: score += 10
    if row['Organic_carbon'] < 5: score += 10
    if row['Conductivity'] < 1500: score += 10
    if row['Turbidity'] < 5: score += 10
    if row['Hardness'] < 120: score += 5
    
    return score

# Step 3: Apply score to each row
df['Potability_Score'] = df.apply(calculate_potability_score, axis=1)

# Step 4: Map score into categories
def map_potability_level(score):
    if score < 41:
        return 'Very Poor'
    elif score < 61:
        return 'Poor'
    elif score < 76:
        return 'Average'
    elif score < 91:
        return 'Good'
    else:
        return 'Very Good'

df['Potability_Level'] = df['Potability_Score'].apply(map_potability_level)

# Step 5: Ensure correct data types
df['Potability_Score'] = df['Potability_Score'].astype(int)
df['Potability_Level'] = df['Potability_Level'].astype('category')
In [19]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8,5))
sns.countplot(x='Potability_Level', data=df, palette='Spectral', order=df['Potability_Level'].value_counts().index)
plt.title("Distribution of Water Quality Levels")
plt.xlabel("Potability Level")
plt.ylabel("Number of Samples")
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:253: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_vals = vals.groupby(grouper)

Which chemical is most responsible for very poor water quality?¶

In [20]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

# Step 1: Only keep very poor water samples
very_poor_df = df[df["Potability_Level"] == "Very Poor"]

# Step 2: Select only chemical columns
chemical_cols = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
                 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']

# Step 3: Normalize all columns to 0โ€“10 scale using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 10))
normalized_data = scaler.fit_transform(very_poor_df[chemical_cols])

# Step 4: Take mean of each chemical (for Very Poor category)
mean_normalized = pd.DataFrame(normalized_data, columns=chemical_cols).mean().sort_values(ascending=False)

# Step 5: Create Plotly bar chart
fig = px.bar(
    x=mean_normalized.values,
    y=mean_normalized.index,
    orientation='h',
    labels={'x': 'Normalized Mean Value (0โ€“10)', 'y': 'Chemical'},
    title='Normalized Chemical Levels in Very Poor Water Samples',
    text=mean_normalized.round(2)
)

fig.update_traces(marker_color='indianred', textposition='outside')
fig.update_layout(xaxis_range=[0, 10])

fig.show()

Make a ML model to predict the water quality and score by just giving the custom values¶

In [21]:
# ๐Ÿ“ฆ 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# ๐Ÿ“‚ 2. Load and Clean Dataset
df = pd.read_csv("water_potability.csv")
df.fillna(df.median(numeric_only=True), inplace=True)

# ๐Ÿ“Š 3. Define Custom Scoring Function (Safe Bonus + Danger Penalty)
def custom_score(row):
    score = 0

    # โœ… Bonus: Safe ranges = +10 each
    if 6.5 <= row['ph'] <= 8.5: score += 10
    if row['Hardness'] <= 300: score += 10
    if row['Solids'] <= 5000: score += 10
    if row['Chloramines'] <= 4: score += 10
    if row['Sulfate'] <= 250: score += 10
    if row['Conductivity'] <= 800: score += 10
    if row['Organic_carbon'] <= 10: score += 10
    if row['Trihalomethanes'] <= 80: score += 10
    if row['Turbidity'] <= 5: score += 10

    # โŒ Penalty: More dangerous = more negative score
    def penalty(value, limit, factor=1):
        return max(0, (value - limit) // (limit * factor) + 1) * 10

    if row['Trihalomethanes'] > 80:
        score -= penalty(row['Trihalomethanes'], 80, factor=0.5)
    if row['Turbidity'] > 5:
        score -= penalty(row['Turbidity'], 5, factor=0.2)
    if row['Chloramines'] > 4:
        score -= penalty(row['Chloramines'], 4, factor=0.5)
    if row['Sulfate'] > 250:
        score -= penalty(row['Sulfate'], 250, factor=0.5)
    if row['Conductivity'] > 800:
        score -= penalty(row['Conductivity'], 800, factor=0.5)
    if row['Solids'] > 5000:
        score -= penalty(row['Solids'], 5000, factor=1)
    if row['Organic_carbon'] > 10:
        score -= penalty(row['Organic_carbon'], 10, factor=0.5)

    if 6.5 <= row['ph'] <= 8.5:
        score += 10
    else:
        score -= 10

    if row['Hardness'] > 300:
        score -= 10

    return max(score, 0)

# ๐Ÿงฎ 4. Apply Scoring
df['Potability_Score'] = df.apply(custom_score, axis=1)

# ๐Ÿ“ˆ 5. Categorize Potability Level
def categorize(score):
    if score >= 80:
        return "Very Good"
    elif score >= 60:
        return "Good"
    elif score >= 40:
        return "Average"
    elif score >= 20:
        return "Poor"
    else:
        return "Very Poor"

df['Potability_Level'] = df['Potability_Score'].apply(categorize)

# ๐Ÿ“š 6. Split Dataset for Any Further Use (e.g., model validation if needed)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# ๐Ÿค– 7. Function to Predict from User Input
def predict_potability(user_input):
    row = pd.Series(user_input)
    score = custom_score(row)
    level = categorize(score)
    return score, level

# ๐Ÿงช 8. Example Prediction
user_input = {
    'ph': 7.2,                 # 6.5 โ€“ 8.5
    'Hardness': 100,           # 60 โ€“ 120 mg/L (as CaCOโ‚ƒ)
    'Solids': 400,             # < 500 mg/L
    'Chloramines': 2.5,        # < 4.0 mg/L
    'Sulfate': 210,            # < 250 mg/L
    'Conductivity': 700,       # < 800 ยตS/cm
    'Organic_carbon': 4,       # < 5 mg/L
    'Trihalomethanes': 70,     # < 80 ยตg/L
    'Turbidity': 3.5           # < 5 NTU (preferably < 1 NTU)
}

score, level = predict_potability(user_input)
print(f"Predicted Score: {score} / 100")
print(f"Water Quality: {level}")
Predicted Score: 100 / 100
Water Quality: Very Good
In [ ]: